BrainSpan normalised dataset (http://www.brainspan.org/static/download.html)

Downloaded file RNA-Seq Gencode v3c summarized to genes containing ‘normalized RPKM expression values employing a historical normalization method’

Load data

# Load csvs (Downloaded from 'RNA-Seq Gencode v10 summarized to genes' in http://www.brainspan.org/static/download.html)
datExpr = read.csv('./../Data/BrainSpan/expression_matrix.csv', header=FALSE)
datMeta = read.csv('./../Data/BrainSpan/columns_metadata.csv')
geneInfo = read.csv('./../Data/BrainSpan/rows_metadata.csv')

# Remove index column in datExpr
cols = datExpr %>% colnames
datExpr = datExpr %>% dplyr::select(-V1)
colnames(datExpr) = cols[-length(cols)]

# Make sure rows match
if(!all(rownames(datExpr) == geneInfo$row_num)){
 stop('Columns in datExpr don\'t match the rows in datMeta!') 
}

# Assign row names
rownames(datMeta) = paste0('V', datMeta$column_num)
rownames(datExpr) = geneInfo$ensembl_gene_id

# Annotate probes
getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
            'end_position','strand','band','gene_biotype','percentage_gc_content')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
               dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19
datGenes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart)
datGenes = datGenes[match(rownames(datExpr), datGenes$ensembl_gene_id),]
datGenes$length = datGenes$end_position-datGenes$start_position


# Group brain regions by lobes
datMeta$Brain_Region = as.factor(datMeta$structure_acronym)
datMeta$Brain_lobe = 'Other'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('Ocx','V1C')] = 'Occipital'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('M1C-S1C','DFC','OFC','VFC','M1C')] = 'Frontal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('PCx','IPC', 'S1C')] = 'Parietal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('AMY','MGE','STC','ITC','HIP','TCx','A1C')] = 'Temporal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('MFC')] = 'Limbic'
datMeta$Brain_lobe=factor(datMeta$Brain_lobe, levels=c('Frontal', 'Temporal', 'Parietal', 'Occipital','Limbic','Other'))

# Create complementary age variables
datMeta = datMeta %>% mutate('age_stage' = gsub('.* ','',age))
datMeta = datMeta %>% mutate('age_numeric' = case_when(grepl('pcw', age) ~ as.numeric(gsub(' pcw','',age))/52,
                                                       grepl('mos', age) ~ as.numeric(gsub(' mos','',age))/12,
                                                       TRUE ~ as.numeric(gsub(' yrs','',age)) ))


# SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI/SFARI_genes_with_ensembl_IDs.csv')
## Parsed with column specification:
## cols(
##   status = col_double(),
##   `gene-symbol` = col_character(),
##   `gene-name` = col_character(),
##   chromosome = col_character(),
##   `genetic-category` = col_character(),
##   `gene-score` = col_double(),
##   syndromic = col_double(),
##   `number-of-reports` = col_double(),
##   ID = col_character()
## )
# GO Annotations
GO_annotations = read.csv('./../Data/GO_annotations/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID' = as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal' = 1)


# Combine SFARI and GO information
gene_info = data.frame('ID'=rownames(datExpr)) %>% left_join(SFARI_genes, by='ID') %>% 
            mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
            left_join(GO_neuronal, by='ID') %>% 
            mutate('gene.score'=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`)) %>%
            mutate('gene.score'=ifelse(is.na(gene.score), 'None', gene.score))


rm(cols, getinfo, mart, GO_annotations, geneInfo)


Remove faulty genes

to_keep = !is.na(datGenes$length)
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
gene_info = gene_info[to_keep,]
rownames(datGenes) = datGenes$ensembl_gene_id

Check sample distribution

RNA-Seq for 524 cortical brain-tissue samples across 26 brain regions belonging to 42 subjects

print(paste0('Dataset includes ', nrow(datExpr), ' genes from ', ncol(datExpr), ' samples belonging to ', length(unique(datMeta$donor_id)), ' different subjects.'))
## [1] "Dataset includes 49089 genes from 524 samples belonging to 42 different subjects."


Brain region distribution:

table(datMeta$Brain_Region)
## 
##     A1C     AMY      CB     CBC     CGE     DFC     DTH     HIP     IPC 
##      31      33       3      29       2      35       5      32      33 
##     ITC     LGE     M1C M1C-S1C      MD     MFC     MGE     Ocx     OFC 
##      34       2      26       5      24      32       2       2      31 
##     PCx     S1C     STC     STR     TCx     URL     V1C     VFC 
##       2      26      36      28       1       2      33      35

Grouped by lobe:

table(datMeta$Brain_lobe)
## 
##   Frontal  Temporal  Parietal Occipital    Limbic     Other 
##       132       169        61        35        32        95


Sex distribution: Fairly balanced

table(datMeta$gender)
## 
##   F   M 
## 226 298


Age distribution:

summary(datMeta$age)
##  1 yrs 10 mos 11 yrs 12 pcw 13 pcw 13 yrs 15 yrs 16 pcw 17 pcw 18 yrs 
##     16     10     14     45     44     16      5     39     14     13 
## 19 pcw 19 yrs  2 yrs 21 pcw 21 yrs 23 yrs 24 pcw 25 pcw 26 pcw  3 yrs 
##     11     16     12     16     16     14     16      1      3     25 
## 30 yrs 35 pcw 36 yrs 37 pcw 37 yrs  4 mos  4 yrs 40 yrs  8 pcw  8 yrs 
##     16      2     16     16     16     33      7     15     16     27 
##  9 pcw 
##     14



Filtering

  1. Filter genes with low expression levels

There is a big concentration of genes with very low values and a threshold of 0.5 seems to separate them from the rest of the data.

mean_expr = data.frame('ID'=rownames(datExpr),'Mean'=rowMeans(datExpr, na.rm=T),
                       'SD'=apply(datExpr,1,function(x) sd(x, na.rm=T)))

mean_expr %>% ggplot(aes(Mean+1)) + geom_density(fill='#0099cc', color='#0099cc', alpha=0.5) + 
     geom_vline(xintercept=1.5, color='gray', linetype='dashed') + scale_x_log10() + theme_minimal()

rm(mean_expr)

Filtering out genes with a mean expression lower than 0.5

to_keep = rowMeans(datExpr, na.rm=TRUE)>0.5
datExpr = datExpr[to_keep,]

print(paste0('Removed ', sum(!to_keep, na.rm=T), ' genes, ', sum(to_keep, na.rm=T), ' remaining'))
## [1] "Removed 30253 genes, 18836 remaining"
rm(to_keep)


  1. Filter outlier samples

Using node connectivity as a distance measure, normalising it and filtering out genes farther away than 2 standard deviations from the left (lower connectivity than average, not higher)

absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))

plot_data = data.frame('ID'=rownames(datMeta), 'sample'=1:length(z.ku), 'distance'=z.ku, 'brainLobe'=datMeta$Brain_lobe)
ggplotly(plot_data %>% ggplot(aes(sample, distance, color=brainLobe)) + geom_point() + 
         geom_hline(yintercept=-2, color='gray') + theme_minimal())
print(paste0('Outlier samples: ', paste(as.character(plot_data$ID[plot_data$distance< -2]), collapse=', ')))
## [1] "Outlier samples: 1, 9, 10, 11, 14, 23, 28, 116, 117, 177, 227, 237, 277, 297, 298, 299, 300, 301, 302, 303, 305, 306, 307, 308, 337, 338, 378, 509, 516, 522"
to_keep = z.ku > -2
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]

print(paste0('Removed ', sum(!to_keep), ' samples, ', sum(to_keep), ' remaining'))
## [1] "Removed 30 samples, 494 remaining"
rm(absadj, netsummary, ku, z.ku, plot_data, to_keep)
print(paste0('After filtering, the dataset consists of ', nrow(datExpr), ' genes and ', ncol(datExpr), ' samples'))
## [1] "After filtering, the dataset consists of 18836 genes and 494 samples"



Visualisations


Log transforming the data to help visualisations

datExpr_transformed = log2(datExpr+1)

Samples

PCA: The main recognisable pattern separates the samples by age, with the biggest separation between preconception and postconception samples. Brain lobe doesn’t seem to be an important factor

pca = datExpr_transformed %>% t %>% prcomp

plot_data = data.frame('ID'=as.numeric(gsub('V','',colnames(datExpr))), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>% 
            left_join(datMeta, by=c('ID'='column_num')) %>%
            dplyr::select('PC1','PC2','age_numeric','age_stage','Brain_lobe','gender')

selectable_scatter_plot(plot_data, plot_data[,-c(1,2)])
rm(pca, plot_data)


Genes

  • First Principal Component explains 81% of the total variance

  • There’s a really strong (negative) correlation between the mean expression of a gene and the 1st principal component

pca = datExpr_transformed %>% prcomp

plot_data = data.frame('PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'PC3' = pca$x[,3], 
                       'MeanExpr'=rowMeans(datExpr_transformed), 'SD'=apply(datExpr_transformed,1,sd))

plot_data %>% ggplot(aes(PC1, PC2, color=MeanExpr)) + geom_point(alpha=0.3) + theme_minimal() + 
              scale_color_viridis() + ggtitle('PCA') +
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))

rm(pca, plot_data)



Mean and SD by SFARI score

Original data

The higher the SFARI score the higher the mean expression, and probably the higher the standard deviation, although that relation is not as clear

plot_data = data.frame('ID'=sub('\\..*', '', rownames(datExpr)),
                       'MeanExpr'=rowMeans(datExpr), 'SDExpr'=apply(datExpr,1,sd)) %>% 
            left_join(gene_info, by='ID')

p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, MeanExpr, fill=gene.score)) + geom_boxplot() + 
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
              theme(legend.position='none'))

p2 = ggplotly(plot_data %>% ggplot(aes(gene.score, SDExpr, fill=gene.score)) + geom_boxplot() + 
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
              ggtitle('Mean Expression (left) and SD (right) by SFARI score') + 
              theme(legend.position='none'))

subplot(p1, p2, nrows=1)
rm(plot_data, p1, p2)



Log transformed data

The higher the SFARI score the higher the mean expression, and probably the lower the standard deviation as well, although that relation is not as clear, again.

plot_data = data.frame('ID'=sub('\\..*', '', rownames(datExpr_transformed)),
                       'MeanExpr'=rowMeans(datExpr_transformed),
                       'SDExpr'=apply(datExpr_transformed,1,sd)) %>% 
            left_join(gene_info, by='ID')

p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, MeanExpr, fill=gene.score)) + geom_boxplot() + 
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
              theme(legend.position='none'))

p2 = ggplotly(plot_data %>% ggplot(aes(gene.score, SDExpr, fill=gene.score)) + geom_boxplot() + 
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
              ggtitle('Mean Expression (left) and SD (right) by SFARI score') + 
              theme(legend.position='none'))

subplot(p1, p2, nrows=1)
rm(plot_data, p1, p2)

Save cleaned dataset

save(datExpr, datMeta, datGenes, gene_info, file='./../Data/BrainSpan/cleaned_data.RData')
#load('./../Data/Gupta/preprocessed_data.RData')



Session info

sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Scientific Linux 7.6 (Nitrogen)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] glue_1.3.1             Rtsne_0.15             vsn_3.50.0            
##  [4] Biobase_2.42.0         BiocGenerics_0.28.0    WGCNA_1.66            
##  [7] fastcluster_1.1.25     dynamicTreeCut_1.63-1  forcats_0.3.0         
## [10] stringr_1.4.0          dplyr_0.8.0.1          purrr_0.3.1           
## [13] readr_1.3.1            tidyr_0.8.3            tibble_2.1.1          
## [16] tidyverse_1.2.1        viridis_0.5.1          viridisLite_0.3.0     
## [19] gridExtra_2.3          plotlyutils_0.0.0.9000 plotly_4.8.0          
## [22] ggplot2_3.1.0          biomaRt_2.38.0        
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_1.4-1      htmlTable_1.11.2      base64enc_0.1-3      
##   [4] rstudioapi_0.7        affyio_1.52.0         bit64_0.9-7          
##   [7] AnnotationDbi_1.44.0  mvtnorm_1.0-7         lubridate_1.7.4      
##  [10] xml2_1.2.0            codetools_0.2-15      splines_3.5.2        
##  [13] doParallel_1.0.14     impute_1.56.0         robustbase_0.93-0    
##  [16] knitr_1.22            Formula_1.2-3         jsonlite_1.6         
##  [19] Cairo_1.5-9           broom_0.5.1           cluster_2.0.7-1      
##  [22] GO.db_3.7.0           shiny_1.2.0           BiocManager_1.30.4   
##  [25] rrcov_1.4-3           compiler_3.5.2        httr_1.4.0           
##  [28] backports_1.1.2       assertthat_0.2.1      Matrix_1.2-15        
##  [31] lazyeval_0.2.2        limma_3.38.3          cli_1.1.0            
##  [34] later_0.8.0           acepack_1.4.1         htmltools_0.3.6      
##  [37] prettyunits_1.0.2     tools_3.5.2           gtable_0.2.0         
##  [40] affy_1.60.0           Rcpp_1.0.1            cellranger_1.1.0     
##  [43] preprocessCore_1.44.0 nlme_3.1-137          crosstalk_1.0.0      
##  [46] iterators_1.0.9       xfun_0.5              rvest_0.3.2          
##  [49] mime_0.6              XML_3.98-1.11         DEoptimR_1.0-8       
##  [52] MASS_7.3-51.1         zlibbioc_1.28.0       scales_1.0.0         
##  [55] promises_1.0.1        hms_0.4.2             RColorBrewer_1.1-2   
##  [58] curl_3.3              yaml_2.2.0            memoise_1.1.0        
##  [61] rpart_4.1-13          latticeExtra_0.6-28   stringi_1.4.3        
##  [64] RSQLite_2.1.1         S4Vectors_0.20.1      pcaPP_1.9-73         
##  [67] foreach_1.4.4         checkmate_1.8.5       rlang_0.3.2          
##  [70] pkgconfig_2.0.2       matrixStats_0.54.0    bitops_1.0-6         
##  [73] evaluate_0.13         lattice_0.20-38       labeling_0.3         
##  [76] htmlwidgets_1.3       bit_1.1-14            tidyselect_0.2.5     
##  [79] robust_0.4-18         plyr_1.8.4            magrittr_1.5         
##  [82] R6_2.4.0              IRanges_2.16.0        generics_0.0.2       
##  [85] Hmisc_4.1-1           fit.models_0.5-14     DBI_1.0.0            
##  [88] pillar_1.3.1          haven_1.1.1           foreign_0.8-71       
##  [91] withr_2.1.2           survival_2.43-3       RCurl_1.95-4.10      
##  [94] nnet_7.3-12           modelr_0.1.4          crayon_1.3.4         
##  [97] rmarkdown_1.12        progress_1.2.0        grid_3.5.2           
## [100] readxl_1.1.0          data.table_1.12.0     blob_1.1.1           
## [103] digest_0.6.18         xtable_1.8-3          httpuv_1.5.0         
## [106] stats4_3.5.2          munsell_0.5.0